1.) Read in the data

# Save the current warning level and silence warnings globally for the
# knitted output. NOTE(review): defaultW is never restored anywhere in this
# document (no later options(warn = defaultW)), so warnings stay suppressed
# for the whole session — consider restoring it at the end, or wrapping only
# the noisy calls in suppressWarnings().
defaultW <- getOption("warn") 
options(warn = -1)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Read the raw Online Retail II transaction data and preview the first rows.
# NOTE(review): absolute, machine-specific path — a relative path or
# file.path()/here::here() would make the script portable.
retail <- read.csv("/Users/lawrence/Google Drive/DS/Time Series/OnlineRetailUCI2-master/online_retail_II.csv")
head(retail)
##   Invoice StockCode                         Description Quantity
## 1  489434     85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS       12
## 2  489434    79323P                  PINK CHERRY LIGHTS       12
## 3  489434    79323W                 WHITE CHERRY LIGHTS       12
## 4  489434     22041        RECORD FRAME 7" SINGLE SIZE        48
## 5  489434     21232      STRAWBERRY CERAMIC TRINKET BOX       24
## 6  489434     22064          PINK DOUGHNUT TRINKET POT        24
##           InvoiceDate Price Customer.ID        Country
## 1 2009-12-01 07:45:00  6.95       13085 United Kingdom
## 2 2009-12-01 07:45:00  6.75       13085 United Kingdom
## 3 2009-12-01 07:45:00  6.75       13085 United Kingdom
## 4 2009-12-01 07:45:00  2.10       13085 United Kingdom
## 5 2009-12-01 07:45:00  1.25       13085 United Kingdom
## 6 2009-12-01 07:45:00  1.65       13085 United Kingdom

2.) Prepare and comment on the following exploratory plots: Total daily, weekly and monthly sales volumes

# Derive a Date (YYYY-MM-DD) and a MonthYear (YYYY-MM) column from the
# InvoiceDate timestamp, and compute the Sales value of each line item.
# Fix: mutate() now uses the pipeline's own columns — the original referenced
# retail$Quantity * retail$Price, which silently breaks if any earlier pipe
# step drops, filters or reorders rows. Also `<-` instead of trailing `->`.
retail <- retail %>% 
  separate(InvoiceDate, into = c('Date'), sep = " ", remove = F) %>% 
  separate(InvoiceDate, into = c('MonthYear'), sep = c(7), remove = F) %>%
  mutate(Sales = Quantity * Price)

# Aggregate total sales by month-year (adding a Year column) and by day
monthly_yearly <- aggregate(Sales ~ MonthYear, data = retail, FUN = sum) %>%
  separate(MonthYear, into = c('Year'), sep = c(4), remove = F)
daily <- aggregate(Sales ~ Date, data = retail, FUN = sum)

# Summary statistics of daily sales volume.
val <- c(mean(daily$Sales), median(daily$Sales), max(daily$Sales), min(daily$Sales))
title <- c('Mean', 'Median', 'Max', 'Min')
# Fix: a named numeric vector keeps the values numeric — rbind(title, val)
# built a character matrix, coercing every statistic to a string.
setNames(val, title)
##       [,1]               [,2]        [,3]        [,4]        
## title "Mean"             "Median"    "Max"       "Min"       
## val   "31932.5340529801" "27481.625" "117271.12" "-22212.609"
# Rename columns for plotting, then draw the daily sales series
names(daily) <- c("Date", "SalesVolume")
#Daily sales volume plot
ggplot(daily, aes(Date, SalesVolume, group = 1)) +
  geom_line() +
  ggtitle("Daily total sales volume")

Sales volume appears to be highly volatile by day, with the mean being at 32,000 and median at 27,000 pounds per day, a few outliers however increasing to up to 117,000 pounds per day (maybe a sale?) and -22000, maybe just after Christmas when everyone is returning their presents.

# Sales volume by month and year.
# Fix: fill is mapped inside aes() so ggplot builds a proper colour scale and
# legend — `fill = monthly_yearly$Year` outside aes() passes the raw values
# as literal colours, which is fragile and bypasses the scale machinery.
ggplot(monthly_yearly, aes(x = MonthYear, y = Sales, fill = Year)) +
  geom_bar(stat = "identity") +
  ggtitle("Monthly/ Yearly sales volume")

There appears to be a clear monthly pattern with increasing sales towards the end of the year (and the Christmas season).

Last months’ revenue share by product and by customer

# Restrict to the last month in the data (December 2011)
lastMonth <- retail[retail$MonthYear == '2011-12', ]
# Remove postage, fees and other sales unrelated expenses.
# Fix: %in% against a vector of codes replaces the long chain of
# `== ... | == ...` comparisons (clearer, and easy to extend).
non_product_codes <- c("DOT", "CRUK", "POST", "C2", "AMAZONFEE")
lastMonth <- lastMonth[!(lastMonth$StockCode %in% non_product_codes), ]

# Total sales per customer, largest first; every customer outside the top 10
# is collapsed into an "Others" bucket for plotting.
by_cust <- lastMonth %>%
  group_by(Customer.ID) %>%
  summarise(TotalSales = sum(Sales)) %>%
  arrange(desc(TotalSales)) %>%
  mutate(
    Index = row_number(),
    Customer.ID = ifelse(Index > 10, "Others", Customer.ID)
  )

#10 largest customers (drop the "Others" bucket and missing customer ids)
top_cust <- by_cust[!(by_cust$Customer.ID == "Others" | is.na(by_cust$Customer.ID)), ]
ggplot(top_cust, aes(x = reorder(Customer.ID, -TotalSales), y = TotalSales, fill = Customer.ID)) +
  geom_bar(stat = "identity") +
  ggtitle("Last months’ revenue share by customer (10 largest customers)")

#all customers
ggplot(by_cust, aes(x = reorder(Customer.ID, -TotalSales), y = TotalSales, fill = Customer.ID)) +
  geom_bar(stat = "identity") +
  ggtitle("Last months’ revenue share by customer (all customers)")

Sales appear to be highly fragmented, there is no one big customer.

# Total sales per product, largest first; products outside the top 10 are
# collapsed into an "Others" bucket for plotting.
by_prod <- lastMonth %>%
  group_by(StockCode) %>%
  summarise(TotalSales = sum(Sales)) %>%
  arrange(desc(TotalSales)) %>%
  mutate(
    Index = row_number(),
    StockCode = ifelse(Index > 10, "Others", StockCode)
  )

#10 products with highest revenue share
top_prod <- by_prod[!(by_prod$StockCode == "Others"), ]
ggplot(top_prod, aes(x = reorder(StockCode, -TotalSales), y = TotalSales, fill = StockCode)) +
  geom_bar(stat = "identity") +
  ggtitle("Last months’ revenue share by product for 10 best selling products")

#All products
ggplot(by_prod, aes(x = reorder(StockCode, -TotalSales), y = TotalSales, fill = StockCode)) +
  geom_bar(stat = "identity") +
  ggtitle("Last months’ revenue share by product")

sum(by_prod$TotalSales)  #Total sales this month
## [1] 441393.9

Again, sales appear to be highly fragmented by product type, each having monthly sales between 100-10000 pounds, out of total sales of 441393.9 pounds.

Weighted average monthly sale price by volume

# Volume-weighted average sale price per month
VWAP <- retail %>%
  group_by(MonthYear) %>%
  summarise(vwap = sum(Sales) / sum(Quantity))

ggplot(VWAP, aes(MonthYear, vwap, group = 1)) +
  geom_line() +
  ggtitle("Weighted average monthly sale price by volume")

There is no obvious trend in the weighted average monthly sale price by volume, it fluctuates between 1.60 to 2.20 pounds.

  1. You’ll note the dataset contains negative volumes, relating to sales returns. Some of these returns relate to products sold before the data collection date, thus should be filtered from the dataset before we use it for modelling. Describe and implement a logical way of performing this task
# Re-read the dataset (presumably the returns-filtered version described
# below). NOTE(review): this overwrites the retail object built above, yet
# later code still uses Sales/Date/MonthYear — retail.csv must already
# contain those derived columns; verify. Absolute path is machine-specific.
retail <-read.csv("/Users/lawrence/Google Drive/DS/Time Series/OnlineRetailUCI2-master/retail.csv")

A logical way to filter these returns out would be to match them with the initial sales order. This cannot be found directly from the invoice, therefore we would need to match it by customer id, stockcode, absolute value of sales and roughly the same time period.

a)The owner of the online retailer wants to know how much revenue to expect for this month (12/2011), to help him decide what sports car he buys his partner for Christmas.

Outline a few different solutions you might suggest that solve this problem. Include in your description:

What metrics/values you might want to use:

I would suggest using combined total sales volume, be it monthly, weekly or daily, as the response variable. This is far better, easier and more accurate to model (due to CLT and large numbers) than individual purchases by specific customers or products. Doing so would result in a far more complicated model that will not necessarily result in more accurate predictions, especially considering the highly fragmented nature of sales by individual customers or products. I will therefore only look at aggregate sales volume as a response variable, and use the Date as an explanatory variable to capture the underlying trend, and Month to capture recurring monthly sales patterns.

How you would aggregate those metrics:

The aggregation could be done using the aggregate function, or summarize using tidyverse.

What model/algorithm or logic you would use to make a prediction on them:

I would use the linear model lm() function in r with totalsales (monthly, daily or weekly) as the response and Date and as.factor(Month) as the explanatory variables.

What uncertainties you might need to explore I would then look at confidence intervals, r^2, residuals and other goodness of fit measures to determine if this is a good fit. If not, another option would be to use the time series functionalities in R with ts() or holtwinters() functions, or potentially to use a glm() as maybe a poisson or other distribution is better suited, or taking the log of the response variable might be an option to improve it as well given that it is sales data.

Looking at the monthly patterns in total sales volume observed in the two previous years, December was the month with the second highest sales in 2010, unless there is a strong reason to suggest otherwise I would expect a similar pattern in 2011. Additionally, overall sales growth will need to be taken into account to come up with an accurate estimation for this years December sales. We also have data for this year’s December of up to the 9th of December. This could be useful for estimation by comparing it with how the last two years full-December sales data compared with sales up until the 9th of December.

In addition, something like a profit margin, COGS and tax estimation would be helpful to determine if the owner can afford the Ferrari or not as just simple sales revenue does not say too much about the owners’ bottom line.

  1. Select the method you think is the best approach, and explain why. Your justification should weigh up the effort required and expected accuracy.

For starters, I think the lm() is the best approach to start as it is the most simple and easy to implement one, while likely still yielding reasonably good accuracy.

  1. Show an implementation one of your solutions (doesn’t need to be your selected method), and show the final forecast alongside the historical time series.
# Monthly total sales, keeping only the Month component so the model pools
# the same calendar month across years.
sales <- aggregate(Sales ~ MonthYear, data = retail, FUN = sum) %>%
  separate(MonthYear, into = c('Year', 'Month')) %>%
  select(-c(Year))
# Remove the last (incomplete) December observation.
# Fix: -nrow(sales) targets the final row explicitly instead of the fragile
# hard-coded index 25.
sales <- sales[-nrow(sales), ]
monthly.lm <- lm(Sales ~ Month, data = sales)
summary(monthly.lm)
## 
## Call:
## lm(formula = Sales ~ Month, data = sales)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -157235  -34335       0   34335  157235 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   592017      60205   9.833 4.29e-07 ***
## Month02       -76440      85143  -0.898 0.386963    
## Month03       132541      85143   1.557 0.145513    
## Month04       -50123      85143  -0.589 0.566987    
## Month05        77312      85143   0.908 0.381742    
## Month06        93438      85143   1.097 0.293989    
## Month07        36252      85143   0.426 0.677810    
## Month08        77712      85143   0.913 0.379362    
## Month09       344652      85143   4.048 0.001616 ** 
## Month10       465920      85143   5.472 0.000142 ***
## Month11       850189      85143   9.985 3.64e-07 ***
## Month12       377194      85143   4.430 0.000821 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 85140 on 12 degrees of freedom
## Multiple R-squared:  0.9483, Adjusted R-squared:  0.9008 
## F-statistic: 19.99 on 11 and 12 DF,  p-value: 4.945e-06
# Predicted December mean with its standard error
monthly.pred <- predict(monthly.lm, newdata = data.frame(Month = '12'), se.fit = TRUE)

# 95% normal-approximation confidence interval around the fitted mean
lower <- monthly.pred$fit - 1.96 * monthly.pred$se.fit
upper <- monthly.pred$fit + 1.96 * monthly.pred$se.fit
cbind(monthly.pred$fit, lower, upper)
##             lower   upper
## 1 969210.4 851208 1087213

Just looking at historical monthly averages, we’d expect December sales to lie between 851,208 and 1,087,213 pounds with 95% confidence.

# One row per day with total sales and the calendar month.
# Fix: Month = first(Month) keeps summarise() to exactly one row per group,
# replacing the multi-row summarise + unique() workaround (each Date maps to
# a single Month, so first() is exact).
daily.sales <- retail %>% 
  select(Sales, MonthYear, Date) %>%     
  separate(MonthYear, into = c('Year', 'Month')) %>% 
  select(-c(Year)) %>%
  group_by(Date) %>% 
  summarise(DailySales = sum(Sales), Month = first(Month))
daily.sales$Date <- as.Date(daily.sales$Date)  # convert to Date class
# Exclude the last day as sales don't appear to be complete for that day.
# Fix: -nrow() instead of the fragile hard-coded row index 604.
daily.sales <- daily.sales[-nrow(daily.sales), ]

# Daily sales modelled on a linear date trend plus a month factor
daily.lm <- lm(DailySales ~ Date + Month, data = daily.sales)
summary(daily.lm)
## 
## Call:
## lm(formula = DailySales ~ Date + Month, data = daily.sales)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -45823  -7534   -920   5929  72075 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -69356.212  48194.239  -1.439  0.15065    
## Date             6.349      3.251   1.953  0.05130 .  
## Month02      -3362.758   3088.992  -1.089  0.27676    
## Month03       1801.455   3006.525   0.599  0.54928    
## Month04       -543.902   3167.746  -0.172  0.86373    
## Month05       1874.935   3097.529   0.605  0.54521    
## Month06        748.186   3066.365   0.244  0.80732    
## Month07      -1643.851   3083.512  -0.533  0.59416    
## Month08       -242.390   3103.767  -0.078  0.93778    
## Month09       9825.167   3127.820   3.141  0.00177 ** 
## Month10      14296.506   3154.047   4.533 7.05e-06 ***
## Month11      28883.529   3183.082   9.074  < 2e-16 ***
## Month12      23836.384   3090.205   7.714 5.24e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15130 on 590 degrees of freedom
## Multiple R-squared:  0.3331, Adjusted R-squared:  0.3195 
## F-statistic: 24.55 on 12 and 590 DF,  p-value: < 2.2e-16

Looking at this model, we can see that there appears to be a (weak) sales growth trend: on average, sales tend to increase by about 6 pounds on every calendar day. This trend is only marginally significant however (P-value = 0.051). Using this model to predict remaining sales in December:

# Remaining December 2011 days to forecast (9th through 31st).
# Fix: seq() generates the 23 dates, replacing the hand-typed vector of
# date strings (less error-prone, trivially adjustable).
dec <- data.frame(
  Date = seq(as.Date('2011-12-09'), as.Date('2011-12-31'), by = "day"),
  Month = '12'
)

daily.pred <- predict(daily.lm, newdata = dec, se.fit = TRUE)

# 95% confidence bounds for each forecast day
lower <- daily.pred$fit - 1.96 * daily.pred$se.fit
upper <- daily.pred$fit + 1.96 * daily.pred$se.fit

# Sales already booked 1-8 December 2011 (the 9th onward is forecast; the
# last recorded day does not appear to be complete, so it is excluded)
prev.sales <- retail %>%
  filter(Date > '2011-11-30' & Date < '2011-12-09') %>%
  summarise(TotalSales = sum(Sales))
prev.sales <- prev.sales[[1]]

# Total-December prediction: forecast remainder plus sales booked so far
cbind(sum(daily.pred$fit) + prev.sales, sum(lower) + prev.sales, sum(upper) + prev.sales)
##         [,1]    [,2]    [,3]
## [1,] 1592949 1471916 1713983
# Last year's (December 2010) total sales for comparison
last.dec <- retail %>%
  filter(Date > '2010-11-30' & Date < '2011-01-01') %>%
  summarise(TotalSales = sum(Sales))
last.dec[[1]]
## [1] 1126445

This model taking into account the growth trend would predict total sales to lie between 1,471,916 and 1,713,983 pounds. This figure seems quite high, especially considering last year’s total December sales were only 1,126,445. The model is likely putting too much weight on the sales increase from December 2009 to December 2010.

# December sales up to (and excluding) the 9th, this year and last
this.dec.frac <- retail %>%
  filter(Date > '2011-11-30' & Date < '2011-12-09') %>%
  summarise(TotalSales = sum(Sales))
last.dec.frac <- retail %>%
  filter(Date > '2010-11-30' & Date < '2010-12-09') %>%
  summarise(TotalSales = sum(Sales))
this.dec.frac[[1]]  #Total sales this year in the first eight December days
## [1] 401536.5
last.dec.frac[[1]]  #Total sales last year in the first eight December days
## [1] 649912.6
# Fraction of last year's total December sales that fell in the first eight days
frac <- last.dec.frac[[1]] / last.dec[[1]]
frac 
## [1] 0.5769588
this.dec.frac[[1]] / frac   #Extrapolating this percentage on this years first eight days sales.
## [1] 695953.5

Another interesting point is that total sales in the first eight days of December last year were about 650,000 pounds, while this year this figure is only at about 401,000 pounds. Given that last year 57% of December sales occurred in these first eight days, this may be cause for concern about the accuracy of the models, especially the second one. Looking purely at these numbers, we would only expect about 700,000 pounds of total December sales this year. However, this might just be a coincidence; maybe Christmas shopping is simply being left a bit later in 2011 than in 2010.

How confident are you of this forecast - do you back your prediction enough to recommend the new Ferrari?

Based on these estimates, I would give a conservative estimate of Sales to be between 700,000 to 900,000 pounds. Depending on the owners profit margin, this could be enough for the Ferrari, but to be on the cautious side maybe a Toyota would be a better choice for this year.

While the linear model is a very useful multi-purpose tool, let’s now see how a few more specialized approaches designed for time series analysis compare instead. We will be looking at exponential smoothing and models from the ARIMA family.

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tsibble)
## 
## Attaching package: 'tsibble'
## The following object is masked from 'package:lubridate':
## 
##     interval
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(fpp3)
## ── Attaching packages ──────────────────────────────────────────── fpp3 0.4.0 ──
## ✓ tsibbledata 0.3.0     ✓ fable       0.3.1
## ✓ feasts      0.2.2
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## x lubridate::date()    masks base::date()
## x dplyr::filter()      masks stats::filter()
## x tsibble::intersect() masks base::intersect()
## x tsibble::interval()  masks lubridate::interval()
## x dplyr::lag()         masks stats::lag()
## x tsibble::setdiff()   masks base::setdiff()
## x tsibble::union()     masks base::union()
library(zoo)
## 
## Attaching package: 'zoo'
## The following object is masked from 'package:tsibble':
## 
##     index
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Some calendar days have no recorded sales, which breaks several forecasting
# methods. Insert the missing dates and linearly interpolate their Sales and
# Month values with na.approx() (for a single missing day this equals the
# mean of the neighbouring days).

# Fix (consistent with the earlier daily.sales block): named aggregation with
# first() keeps summarise() to one row per group, replacing the multi-row
# summarise + unique() workaround.
retail %>% 
  select(Sales, Date, MonthYear) %>%
  separate(MonthYear, into = c('Year', 'Month')) %>%
  select(-c(Year)) %>% group_by(Date) %>% 
  summarise(Sales = sum(Sales), Month = first(as.integer(Month))) -> daily.sales

# Insert the missing calendar days as NA rows
daily.sales %>%
  mutate(Date = as.Date(Date)) %>%
  complete(Date = seq.Date(min(Date), max(Date), by = "day")) -> daily.sales

# Convert to a tsibble indexed by Date; fill_gaps() is kept as a safety net
# (complete() above should already have made the index gap-free)
daily.sales <- as_tsibble(daily.sales, index = Date) %>% fill_gaps()

# Linearly interpolate the NA days
daily.sales$Sales <- na.approx(daily.sales$Sales)
daily.sales$Month <- na.approx(daily.sales$Month)

# Plot the gap-filled daily series.
# Fix: the y-label said "dollars" — this is a UK retailer and every other
# label in the document uses pounds.
autoplot(daily.sales, Sales) +
  labs(y = "Daily sales in pounds",
       title = "Daily sales")

Again, as noted before, there do appear to be some seasonal patterns going on but no clear upwards or downwards trend. Let’s see what a model decomposition into trend, seasonal and cyclical components can reveal about the data. We have the choice between using an additive or multiplicative model, depending on the structure of the data. Given that we don’t see a clear trend in the data, mostly seasonal variation, an additive model seems appropriate for now.

# Classical additive decomposition into trend, seasonal and remainder
dcmp <- daily.sales %>%
  model(classical_decomposition(Sales, type = "additive"))
components(dcmp)
## # A dable: 739 x 7 [1D]
## # Key:     .model [1]
## # :        Sales = trend + seasonal + random
##    .model                Date        Sales  trend seasonal  random season_adjust
##    <chr>                 <date>      <dbl>  <dbl>    <dbl>   <dbl>         <dbl>
##  1 "classical_decomposi… 2009-12-01 54177.    NA     5791.     NA         48386.
##  2 "classical_decomposi… 2009-12-02 63217.    NA     1762.     NA         61455.
##  3 "classical_decomposi… 2009-12-03 73959.    NA     7575.     NA         66384.
##  4 "classical_decomposi… 2009-12-04 40624. 44466.    -875.  -2966.        41500.
##  5 "classical_decomposi… 2009-12-05  9803. 43504.   -6409. -27292.        16212.
##  6 "classical_decomposi… 2009-12-06 24482. 40244.  -11524.  -4237.        36007.
##  7 "classical_decomposi… 2009-12-07 44999. 35870.    3680.   5448.        41318.
##  8 "classical_decomposi… 2009-12-08 47444. 35556.    5791.   6097.        41653.
##  9 "classical_decomposi… 2009-12-09 40397. 38476.    1762.    158.        38634.
## 10 "classical_decomposi… 2009-12-10 43342. 38131.    7575.  -2364.        35767.
## # … with 729 more rows
# Plot all decomposition components in one panel stack
autoplot(components(dcmp))

The decomposition appears to confirm our initial observation of the absence of any significant, multi-year trend. There does however seem to be a strong yearly, cyclical pattern.

A good benchmark for time series data to compare more complex models to is the naive model, which simply predicts that future values will be equal to the most recent observation. There is also a seasonal naive variant, which does the same but adjusted for seasonality. Let’s fit these two simple models and see how we can improve on them further. I will use the last 30 days of data as a test set and the remainder as training set:

#Split into training and test set: hold out the final 30 days
cutoff <- max(daily.sales$Date) - 30
train <- daily.sales %>%
  filter_index(~ as.character(cutoff))

test <- daily.sales %>%
  filter_index(as.character(cutoff + 1) ~ .)

# Benchmark models: naive (repeat last value) and seasonal naive
sales_fit_naive <- train %>% model(`Naïve` = NAIVE(Sales))
sales_fit_snaive <- train %>% model(`Seasonal naïve` = SNAIVE(Sales))

# 30-day-ahead forecasts from each benchmark
sales_fc_naive <- sales_fit_naive %>% forecast(h = 30)
sales_fc_snaive <- sales_fit_snaive %>% forecast(h = 30)

# Plot the naive forecast over the training series, with the held-out
# actuals overlaid in grey
sales_fc_naive %>%
  autoplot(train, level = NULL) +
  autolayer(test, colour = "grey", alpha = 0.6) +
  labs(y = "Daily Sales (in Pounds)",
       title = "Forecasts for final 30 days of sales") +
  guides(colour = guide_legend(title = "Forecast"))
## `mutate_if()` ignored the following grouping variables:
## Column `Date`
## Plot variable not specified, automatically selected `.vars = Sales`

# Same plot for the seasonal naive forecast
sales_fc_snaive %>%
  autoplot(train, level = NULL) +
  autolayer(test, colour = "grey", alpha = 0.6) +
  labs(y = "Daily Sales (in Pounds)",
       title = "Forecasts for final 30 days of sales") +
  guides(colour = guide_legend(title = "Forecast"))
## `mutate_if()` ignored the following grouping variables:
## Column `Date`
## Plot variable not specified, automatically selected `.vars = Sales`

Not a bad first model. Let’s take a look at the residuals to see if there is more data we can capture in our model:

# Residual diagnostics for the naive model (time plot, ACF, histogram);
# fit once and reuse instead of refitting for each diagnostic
naive_fit <- daily.sales %>% model(NAIVE(Sales))
naive_fit %>% gg_tsresiduals()

# Ljung-Box portmanteau test on the innovation residuals
naive_fit %>% augment() %>% features(.innov, ljung_box, lag = 730, dof = 0)
## # A tibble: 1 x 3
##   .model       lb_stat lb_pvalue
##   <chr>          <dbl>     <dbl>
## 1 NAIVE(Sales)   2928.         0

Both the acf plot and the ljung_box test indicate that there is in fact significant auto-correlation in the residuals, indicating that we should proceed and try fit a more complex model. Let’s see how they behave for the seasonal naive model.

# Same residual diagnostics for the seasonal naive model
snaive_fit <- daily.sales %>% model(SNAIVE(Sales))
snaive_fit %>% gg_tsresiduals()

snaive_fit %>% augment() %>% features(.innov, ljung_box, lag = 730, dof = 0)
## # A tibble: 1 x 3
##   .model        lb_stat lb_pvalue
##   <chr>           <dbl>     <dbl>
## 1 SNAIVE(Sales)   1188.         0

Again, we see significant auto-correlation with the lags, and a highly significant p-value in the ljung box test. Let’s proceed with a few more complex models. Let’s take a look at some performance metrics, in particular RMSE and MAE.

# Test-set accuracy of both benchmark models, stacked into one table
bind_rows(accuracy(sales_fc_naive, daily.sales), accuracy(sales_fc_snaive, daily.sales))
## # A tibble: 2 x 10
##   .model         .type     ME   RMSE    MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr>          <chr>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 Naïve          Test  -8320. 20213. 15954. -30.8  39.4  1.35 1.19  0.0885
## 2 Seasonal naïve Test  -4025. 14868. 12173. -16.9  26.6  1.03 0.877 0.0826

We can see that the SNaive model has a much lower error than the naive one. Let’s see how a few more complex models compare to this.

Linear Time Series model

# Time-series linear model: linear trend plus day-of-week seasonality
sales_lm <- train %>%
  model(TSLM(Sales ~ trend() + season()))
report(sales_lm)
## Series: Sales 
## Model: TSLM 
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -57171  -9615  -3251   6736  86395 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    33517.289   1845.132  18.165  < 2e-16 ***
## trend()            9.604      2.852   3.368 0.000799 ***
## season()week2  -8812.329   2186.869  -4.030 6.20e-05 ***
## season()week3 -14094.636   2186.875  -6.445 2.15e-10 ***
## season()week4 -18926.444   2186.884  -8.655  < 2e-16 ***
## season()week5  -4703.573   2186.897  -2.151 0.031833 *  
## season()week6  -2066.016   2181.505  -0.947 0.343935    
## season()week7  -6155.879   2181.513  -2.822 0.004910 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15540 on 701 degrees of freedom
## Multiple R-squared: 0.1507,  Adjusted R-squared: 0.1423
## F-statistic: 17.77 on 7 and 701 DF, p-value: < 2.22e-16
# 30-day regression forecast plotted over the training data
sales_lm_fc <- sales_lm %>% forecast(h = 30)
sales_lm_fc %>%
  autoplot(train) +
  labs(title = "Forecasts of daily sales using regression",
       y = "Sales in Pounds")
## `mutate_if()` ignored the following grouping variables:
## Column `Date`

The linear model appears to be putting too much weight on the average sales value from earlier in the year.

# Out-of-sample accuracy of the regression forecast
accuracy(sales_lm_fc,daily.sales)
## # A tibble: 1 x 10
##   .model              .type     ME   RMSE    MAE   MPE  MAPE  MASE RMSSE    ACF1
##   <chr>               <chr>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 TSLM(Sales ~ trend… Test  21399. 26136. 21465.  35.3  35.6  1.82  1.54 -0.0412

The linear model’s prediction therefore is quite far off, performing even worse than the naive model.

Exponential Smoothing (ETS)/ Holt- Winters

Let’s take a look at how a model from the exponential smoothing (ETS) family of models performs instead. With Holt-Winters models, we have the choice of selecting either an additive or multiplicative model. The right choice here depends on whether there exists a (exponentially increasing) trend or the data looks rather flat. In this case, either model could work as the graph looks mostly flat, but there appears to be a slight trend in the later half of the year. Let’s try fit both models, as well as an automatically determined model.

# Fit three ETS / Holt-Winters models on the training data:
#  - additive: additive errors, trend and seasonality
#  - multiplicative: multiplicative errors and seasonality, additive trend
#  - auto: ETS() selects the specification by information criterion
hw_fit <- train %>%
  model(
    additive = ETS(Sales ~ error("A") + trend("A") + season("A")),
    multiplicative = ETS(Sales ~ error("M") + trend("A") + season("M")),
    auto = ETS(Sales)
  )
report(hw_fit)
## # A tibble: 3 x 9
##   .model           sigma2 log_lik    AIC   AICc    BIC      MSE     AMSE     MAE
##   <chr>             <dbl>   <dbl>  <dbl>  <dbl>  <dbl>    <dbl>    <dbl>   <dbl>
## 1 additive        1.49e+8  -8993. 18009. 18010. 18064.   1.47e8   1.63e8 8.38e+3
## 2 multiplica…     1.78e-1  -8922. 17867. 17867. 17922.   1.45e8   1.59e8 3.06e-1
## 3 auto            1.48e+8  -8991. 18002. 18002. 18048.   1.46e8   1.61e8 8.39e+3

All three information criteria (AIC,AICc and BIC) seem pretty close to each other for all three models. Let’s see how their forecasts compare.

# 30-day Holt-Winters forecasts, with the held-out actuals overlaid in grey
fc_hw <- hw_fit %>% forecast(h = 30)

fc_hw %>%
  autoplot(train, level = NULL) +
  autolayer(test, colour = "grey", alpha = 0.6) +
  labs(y = "Daily Sales (in Pounds)",
       title = "Holt-Winters forecasts for final 30 days of sales") +
  guides(colour = guide_legend(title = "Forecast"))
## `mutate_if()` ignored the following grouping variables:
## Column `Date`
## Plot variable not specified, automatically selected `.vars = Sales`

Both models appear to be quite similar to the seasonal naive approach, without any strong trend or cyclical component. This perhaps makes sense since we were not able to identify any significant trends in the training data.

# Test-set accuracy of the three ETS models
accuracy(fc_hw, daily.sales)
## # A tibble: 3 x 10
##   .model         .type     ME   RMSE    MAE   MPE  MAPE  MASE RMSSE    ACF1
##   <chr>          <chr>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 additive       Test  -4283. 15979. 12118. -18.9  27.8 1.03  0.943 -0.0107
## 2 auto           Test  -2705. 15508. 11484. -15.5  25.7 0.975 0.915 -0.0218
## 3 multiplicative Test  -3214. 15055. 10363. -11.7  20.1 0.880 0.888 -0.192

All three models are a clear improvement on the naive models, which is a great start. Contrary to our intuitive preference for an additive model, however, the multiplicative model performs best on the test set with an RMSE of 15055, compared to 15979 for the additive model, while the automatically determined model sits in between with an RMSE of 15508.

ARIMA model

Let’s see how another popular group, the ARIMA family of models, performs on this dataset. ARIMA is another very popular approach for time series forecasting. The idea here is, as opposed to exponential smoothing where the predictions are based on trend and seasonality of the data, to make predictions based on autocorrelation of future observations with the past.

The ARIMA class of models requires time series to be made stationary in order to make useful predictions. Stationarity refers to the series not containing any trend or seasonality, and rather just have random white noise or unpredictable cycles. Variance should also be roughly constant, if it is not a transformation such as taking the logarithm of the series or a Box-Cox transformation can be used.

In many time series, this is not the case. In these situations, a technique called differencing can be used to make a series stationary. There are two common forms of differencing: taking the first order difference, which simply refers to subtracting the previous observation from every observation. This makes the time series essentially a series of changes from the last observation. In the example of stock prices, this means instead of a time series of stock prices it simply becomes a time series of daily stock price changes, which in a lot of cases tend to be stable. The second common form is taking the seasonal difference, if there is a strong seasonal pattern. If the pattern is quarterly, for example, this means subtracting the most recent quarter from each quarterly observation, making the time series essentially a series of quarterly changes.

Checking for stationarity can be done by inspecting the time series plot, as well as looking at ACF and PACF plots. Having seen the time plot above, it appears that differencing might be a good idea. Let’s take a look at the first order difference:

# Time plot, ACF and PACF of the first-differenced series
daily.sales %>%  gg_tsdisplay(difference(Sales) , plot_type='partial')

It appears that there is a weekly pattern that the data is autocorrelated with. Let’s also take the weekly difference now:

# Seasonal (lag-7) difference followed by a first difference
daily.sales %>%
  gg_tsdisplay(difference(Sales, 7) %>% difference(), plot_type = 'partial') +
  labs(title = "Double differenced", y = "")

Looking at the plot, a 1st order MA model with a seasonal MA(1) might be appropriate. Let’s see how this works. I will try fit a ARIMA(0,1,2)(0,1,1) model first, and also see how an automatically fitted model performs as well.

# Candidate ARIMA models suggested by the ACF/PACF plots, plus an
# exhaustive automatic search
fit <- train %>%
  model(
    arima012011 = ARIMA(Sales ~ pdq(0, 1, 2) + PDQ(0, 1, 1)),
    arima210011 = ARIMA(Sales ~ pdq(2, 1, 0) + PDQ(0, 1, 1)),
    arima312011 = ARIMA(Sales ~ pdq(3, 1, 2) + PDQ(0, 1, 1)),
    # stepwise = FALSE and approx = FALSE make the search slower but more
    # thorough, increasing the chance of finding the best model
    auto = ARIMA(Sales, stepwise = FALSE, approx = FALSE)
  )
fc_arima <- fit %>% forecast(h = 30)
fc_arima %>%
  autoplot(train, level = NULL) +
  autolayer(
    test,
    colour = "grey", alpha=0.6
  ) +
  labs(
    y = "Daily Sales (in Pounds)",
    title = "Forecasts for final 30 days of sales (ARIMA forecast)"
  ) +
  guides(colour = guide_legend(title = "Forecast"))
## `mutate_if()` ignored the following grouping variables:
## Column `Date`
## Plot variable not specified, automatically selected `.vars = Sales`

# Out-of-sample accuracy of the four ARIMA candidates
fc_arima %>% accuracy(daily.sales)
## # A tibble: 4 x 10
##   .model      .type     ME   RMSE    MAE    MPE  MAPE  MASE RMSSE    ACF1
##   <chr>       <chr>  <dbl>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 arima012011 Test  -3351. 15393. 11228. -16.1   25.0 0.953 0.908 -0.0433
## 2 arima210011 Test  -6913. 16608. 13163. -23.4   30.2 1.12  0.980 -0.0392
## 3 arima312011 Test   2608. 14856.  9886.  -3.26  19.0 0.839 0.877 -0.0904
## 4 auto        Test   4378. 17792. 13323.  -3.10  26.9 1.13  1.05   0.132

The arima312011 model has an RMSE of 14856 on the test data. Interestingly, the automatically selected model performs the worst. It appears that letting the computer do all the thinking can sometimes be dangerous.

Neural Network model

I will introduce one more model into this analysis, which is based on fitting a neural network on the lags of the data to predict future values. I will be using the NNETAR() function for this:

# Fit a hand-tuned NNAR model (10 ordinary lags, 2 seasonal lags) and an
# automatically selected neural network on the training set
nn_fit <- train %>%
  model(
    nn102 = NNETAR(Sales, p = 10, P = 2),
    nnauto = NNETAR(Sales)
  )
fc_nn <- nn_fit %>% forecast(h = 30)

# Plot the forecasts against the training data, test set overlaid in grey
fc_nn %>%
  autoplot(train, level = NULL) +
  autolayer(test, colour = "grey", alpha = 0.6) +
  labs(
    y = "Daily Sales (in Pounds)",
    title = "Forecasts for final 30 days of sales (Neural Net forecast)"
  ) +
  guides(colour = guide_legend(title = "Forecast"))
## `mutate_if()` ignored the following grouping variables:
## Column `Date`
## Plot variable not specified, automatically selected `.vars = Sales`

Visually it appears as if both models quite nicely capture some of the seasonal spikes in the data. However, they appear to be falling down a bit too fast again.

# Out-of-sample accuracy of the two neural network models
fc_nn %>% accuracy(daily.sales)
## # A tibble: 2 x 10
##   .model .type    ME   RMSE    MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>  <chr> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 nn102  Test  6280. 22974. 18879.  3.19  34.8  1.60  1.36 0.499
## 2 nnauto Test  3170. 19369. 16101. -2.09  29.7  1.37  1.14 0.338

The accuracy of the neural networks unfortunately isn’t quite as good as the graphs might have suggested. I will be sticking to the exponential smoothing model for my final forecast.

Final prediction

# Refit the ETS model on the full series (all 738 days) for the final forecast
final_fit <- model(daily.sales[1:738, ], ETS(Sales))
report(final_fit)
## Series: Sales 
## Model: ETS(A,N,A) 
##   Smoothing parameters:
##     alpha = 0.277415 
##     gamma = 0.0001002125 
## 
##   Initial states:
##      l[0]     s[0]     s[-1]    s[-2]     s[-3]    s[-4]    s[-5]    s[-6]
##  43290.35 3675.734 -9856.863 -6413.32 -880.5316 7569.498 1757.187 4148.296
## 
##   sigma^2:  152421520
## 
##      AIC     AICc      BIC 
## 18790.17 18790.47 18836.21
# Forecast the remaining 23 days of December 2011 and plot against history
fc_ets <- forecast(final_fit, h = 23)
autoplot(fc_ets, daily.sales, level = NULL) +
  labs(
    y = "Daily Sales (in Pounds)",
    title = "Final forecast for December 2011 sales (ETS)"
  ) +
  guides(colour = guide_legend(title = "Forecast"))
## `mutate_if()` ignored the following grouping variables:
## Column `Date`

Let’s take a look back at our initial linear model and see how this compares:

Looking back at the linear model

Let’s come back to the linear model that I fitted last time and see how it compares to these specialised time series approaches. I will refit the model using the same training/test split that I used for the other two modelling approaches to ensure comparability.

# Assemble the linear-model predictions into a data frame keyed by date
# (column name daily.pred.fit matches what data.frame() derives automatically)
pred_df <- data.frame(
  date = as.Date(date),
  daily.pred.fit = daily.pred$fit
)
pred_df
##          date daily.pred.fit
## 1  2011-12-09       51730.72
## 2  2011-12-10       51737.07
## 3  2011-12-11       51743.42
## 4  2011-12-12       51749.76
## 5  2011-12-13       51756.11
## 6  2011-12-14       51762.46
## 7  2011-12-15       51768.81
## 8  2011-12-16       51775.16
## 9  2011-12-17       51781.51
## 10 2011-12-18       51787.86
## 11 2011-12-19       51794.21
## 12 2011-12-20       51800.56
## 13 2011-12-21       51806.91
## 14 2011-12-22       51813.26
## 15 2011-12-23       51819.61
## 16 2011-12-24       51825.95
## 17 2011-12-25       51832.30
## 18 2011-12-26       51838.65
## 19 2011-12-27       51845.00
## 20 2011-12-28       51851.35
## 21 2011-12-29       51857.70
## 22 2011-12-30       51864.05
## 23 2011-12-31       51870.40
# Overlay the linear-model predictions (blue) on the observed daily sales
autoplot(daily.sales, Sales) +
  autolayer(as_tsibble(pred_df), colour = "blue") +
  labs(
    y = "Daily sales in dollars",
    title = "Daily sales"
  )
## Using `date` as index variable.
## Plot variable not specified, automatically selected `.vars = daily.pred.fit`

Visually, the linear model looks very similar to the naive model which simply takes the last observation as its forecasted value.

# Split into training (first 708 days) and test (final 30 days) sets
train <- daily.sales[1:708, ]
test <- daily.sales[709:738, ]
# Drop the response so the test rows can be supplied as new_data when forecasting
newdata <- test %>% select(-Sales)

# Linear time-series model: monthly effect plus a trend in Date
fit <- train %>%
  model(
    lm = TSLM(Sales ~ Month + Date)
  )
newdata
## # A tsibble: 30 x 2 [1D]
## # Groups:    @ Date [30]
##    Date       Month
##    <date>     <dbl>
##  1 2011-11-09    11
##  2 2011-11-10    11
##  3 2011-11-11    11
##  4 2011-11-12    11
##  5 2011-11-13    11
##  6 2011-11-14    11
##  7 2011-11-15    11
##  8 2011-11-16    11
##  9 2011-11-17    11
## 10 2011-11-18    11
## # … with 20 more rows
# Forecast over the test period and plot against the training data
fc <- fit %>% forecast(new_data = newdata)

autoplot(train, Sales) +
  autolayer(fc) +
  labs(title = "Daily Sales", y = "Sales (in pounds)")

# Out-of-sample accuracy of the linear model
fc %>% accuracy(daily.sales)
## # A tibble: 1 x 10
##   .model .type     ME   RMSE    MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>  <chr>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 lm     Test  15338. 23674. 18718.  18.6  32.1  1.59  1.40 0.171

Our RMSE goodness-of-fit statistic confirms this visual impression. The exponential smoothing approach, purpose-built for time series forecasting, cuts the linear model’s RMSE almost in half.

Conclusion

# Actual sales from 1 Dec to 8 Dec 2011; the last day in the data (9 Dec)
# does not appear to be complete, so it is excluded
prev.sales <- retail %>%
  filter(Date > '2011-11-30', Date < '2011-12-09') %>%
  summarise(TotalSales = sum(Sales)) %>%
  pull(TotalSales)

# Total projection = actual sales so far + ETS forecast for the rest of December
print("Total projected sales for December 2011 using ETS:"); sum(fc_ets$.mean) + prev.sales
## [1] "Total projected sales for December 2011 using ETS:"
## [1] 1793260

Using the exponential smoothing forecast model we come to a total projected sales figure of 1,793,260 pounds.